﻿# coding:utf-8
import sys
import os
import url_manager
import html_downloader
import html_parser
import html_outputer
import threading

class SpiderMain(object):
	"""Crawl one wandoujia.com app category in two phases.

	Phase 1 (craw): walk every listing page of the category and dump all
	app detail-page urls into ``<dir>/<category_id>.txt``.
	Phase 2 (spider): fetch each detail page from that file and store the
	parsed app record in the database.
	"""

	def __init__(self):
		self.purlm = url_manager.PageUrlManager()
		self.downloader = html_downloader.HtmlDownloader()
		self.parser = html_parser.HtmlParser()
		# Working directory for per-category url files and the error log.
		self.dir = 'temp/'

	def get_category_id(self, category_url):
		"""Return the numeric id at the tail of a category url.

		e.g. 'http://www.wandoujia.com/category/386' -> '386'.
		"""
		# 34 == len('http://www.wandoujia.com/category/')
		return category_url[34:]

	def mk_file(self):
		"""Create the working directory if it does not exist yet."""
		if not os.path.exists(self.dir):
			os.mkdir(self.dir)

	def craw(self, category_url):
		"""Phase 1: collect every app detail url of the category into a
		text file (one url per line), then run phase 2 on that file.
		"""
		category_id = self.get_category_id(category_url)
		category, page_num = self.parser.home_page_msg(self.downloader.download(category_url))
		# print(x) is valid in both Python 2 and 3 for a single argument.
		print('category: %s ,page_num : %d' % (category, page_num))
		self.purlm.add_new_urls(category_url, page_num)

		self.mk_file()
		filename = self.dir + category_id + '.txt'
		print(filename)
		outputer = html_outputer.HtmlOutputer(filename)
		while self.purlm.has_new_url():
			page_url = self.purlm.get_new_url()
			detail_urls = self.parser.detail_page_url(self.downloader.download(page_url))
			outputer.output_detail_url(detail_urls)
		outputer.close_file()
		print('-- output detail url over--')
		self.spider(category, filename)

	def spider(self, category, filename):
		"""Phase 2: read detail urls from *filename*, download and parse
		each one, and write the record to the database.

		Failures on individual urls are logged to ``<dir>/errlog.txt``
		and do not stop the run.
		"""
		db_outputer = html_outputer.DbOutputer()
		db_outputer.load()
		log = html_outputer.ErrorLogOutput(self.dir + 'errlog.txt')
		f = open(filename)
		try:
			for raw_line in f:
				# BUG FIX: readline()/iteration keeps the trailing '\n';
				# strip it so the downloaded url, the package-name slice
				# and the stored app_url are clean.
				line = raw_line.strip()
				if not line:
					continue
				try:
					d = self.parser.app_detail_msg(self.downloader.download(line))
					# 30 == len('http://www.wandoujia.com/apps/') -- assumes
					# detail urls share that prefix; TODO confirm in parser.
					app_pkg = line[30:]
					d['app_category'] = category
					d['app_pkg'] = app_pkg
					d['app_url'] = line
					db_outputer.output(d)
				except Exception as e:
					# Best-effort: record the failing url and keep going.
					print(e)
					log.output(line)
		finally:
			# BUG FIX: the original leaked all three handles if an error
			# escaped the loop; close them unconditionally.
			f.close()
			db_outputer.close()
			log.close()
		print('---output app msg over---')


# One crawler instance shared by all worker threads; each thread crawls
# a single category end to end.
spider = SpiderMain()

# Category listing urls to crawl concurrently (see the full reference
# list of categories at the bottom of the file).
_CATEGORY_URLS = (
	'http://www.wandoujia.com/category/386',
	'http://www.wandoujia.com/category/388',
	'http://www.wandoujia.com/category/410',
	'http://www.wandoujia.com/category/412',
	'http://www.wandoujia.com/category/414',
)

# Build one worker thread per category instead of five copy-pasted
# Thread(...) constructions.
threads = [threading.Thread(target=spider.craw, args=(url,))
           for url in _CATEGORY_URLS]




if __name__ == '__main__':
	# Start one daemon worker per category.
	for t in threads:
		t.setDaemon(True)
		t.start()
	# BUG FIX: the original joined only the last thread of the loop, so
	# the process could exit -- killing the remaining daemon workers --
	# as soon as that one category finished. Join every thread.
	for t in threads:
		t.join()
	
	
# 旅游出行 -- http://www.wandoujia.com/category/408 
# 金融理财 -- http://www.wandoujia.com/category/398 
# 视频 -- http://www.wandoujia.com/category/382 
# 购物 -- http://www.wandoujia.com/category/390 
# 音乐 -- http://www.wandoujia.com/category/384  ----
# 图像 -- http://www.wandoujia.com/category/386 
# 新闻阅读 -- http://www.wandoujia.com/category/388 
# 生活实用工具 -- http://www.wandoujia.com/category/410 
# 系统工具 -- http://www.wandoujia.com/category/412 
# 美化手机 -- http://www.wandoujia.com/category/414   ----
# 效率办公 -- http://www.wandoujia.com/category/416 
# 聊天社交 -- http://www.wandoujia.com/category/402 
# 电话通讯 -- http://www.wandoujia.com/category/404 
# 交通导航 -- http://www.wandoujia.com/category/406 
# 生活服务 -- http://www.wandoujia.com/category/392 
# 运动健康 -- http://www.wandoujia.com/category/394 
# 教育培训 -- http://www.wandoujia.com/category/396 
# 丽人母婴 -- http://www.wandoujia.com/category/400 
# 休闲时间 -- http://www.wandoujia.com/category/241 
# 跑酷竞速 -- http://www.wandoujia.com/category/253 
# 宝石消除 -- http://www.wandoujia.com/category/237 
# 网络游戏 -- http://www.wandoujia.com/category/245 
# 动作射击 -- http://www.wandoujia.com/category/251 
# 扑克棋牌 -- http://www.wandoujia.com/category/243 
# 儿童益智 -- http://www.wandoujia.com/category/255 
# 塔防守卫 -- http://www.wandoujia.com/category/239 
# 体育格斗 -- http://www.wandoujia.com/category/257 
# 角色扮演 -- http://www.wandoujia.com/category/247 
# 经营策略 -- http://www.wandoujia.com/category/249